library(tidyverse) # Manejo de Datos
── Attaching core tidyverse packages ──────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.1 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.1 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1 ── Conflicts ────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate) # Manejo de Fechas
library(sp) # Clases Espaciales
library(spatstat) # spatial statistics
Loading required package: spatstat.data
Loading required package: spatstat.geom
spatstat.geom 3.2-1
Loading required package: spatstat.random
spatstat.random 3.1-5
Loading required package: spatstat.explore
Loading required package: nlme
Attaching package: ‘nlme’
The following object is masked from ‘package:dplyr’:
collapse
spatstat.explore 3.1-0
Loading required package: spatstat.model
Loading required package: rpart
spatstat.model 3.2-1
Loading required package: spatstat.linnet
spatstat.linnet 3.0-6
spatstat 3.0-3
For an introduction to spatstat, type ‘beginner’
library(spdep) # procesos puntuales
Loading required package: spData
Loading required package: sf
Linking to GEOS 3.6.2, GDAL 2.2.3, PROJ 4.9.3; sf_use_s2() is TRUE
library(sf) # spatial objects: Simple FEautures
library(terra) # spatial methods
terra 1.7.3
Attaching package: ‘terra’
The following objects are masked from ‘package:spatstat.geom’:
area, delaunay, rescale, rotate, shift, where.max,
where.min
The following object is masked from ‘package:tidyr’:
extract
library("leaflet") # Mapas interactivos
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(tmap) # Mapping
library(OpenStreetMap) # Mapping
library(splines) # Smoothing
library(dbscan) # LOF
Attaching package: ‘dbscan’
The following object is masked from ‘package:stats’:
as.dendrogram
library(solitude) # iForest
library(caret) # Seleccion de Modelos
Loading required package: lattice
Attaching package: ‘lattice’
The following object is masked from ‘package:spatstat.model’:
panel.histogram
The following object is masked from ‘package:spatstat.explore’:
panel.histogram
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
Cargamos las librerias necesarias para graficar
library(readxl)
library(ggplot2)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(RColorBrewer)
library(ggthemes) # estilos de gráficos
library(ggrepel) # etiquetas de texto más prolijas que las de ggplot
library(scales) # tiene la función 'percent()'
library(gganimate) # Para hacer gráficos animados.
library(ggridges) # Para hacer gráficos de densidad faceteados
library(GGally) # Para hacer varios gráficos juntos.
library(cowplot) #Para unir gráficos generados por ggplot2
library(forcats) #Para reordenar factores
library(pyramid) # para pirámide poblacional
library(ggcorrplot) # para correlogramas
library(AER) # para datos
library(hexbin) # para gráfico de dispersión con intensidad por color
library(plotrix)
library(ggmosaic)#mosaicos
#library(webr)# circulares anidados
library(kableExtra)# para tablas
library(vioplot) # para gráficos de violÃn
library(ggpubr)
library(fmsb) #para gráfica de radar
library(mlbench) #data sets del repositorio de UCI
datos <- read.csv('/home/andresfaral/Documents/OSDE/Curso R/Ultima Clase/properati_con_outliers.csv')
datos
str(datos)
'data.frame': 75369 obs. of 37 variables:
$ X : int 0 1 2 3 4 5 6 7 8 9 ...
$ Unnamed..0 : int 0 1 2 3 4 5 6 7 8 9 ...
$ id : chr "U3qdJMKXnOJm0Y1tWpnnfg==" "gsQB/JzLxaQdBLfNcm/DMw==" "SlPt6GJRjM+cO4rD3n3HFQ==" "ZaH+6DXJ4MLM6QqZXhgWiw==" ...
$ lat : num -34.6 -34.6 -34.6 -34.6 -34.6 ...
$ lon : num -58.4 -58.4 -58.4 -58.4 -58.4 ...
$ l3 : chr "Retiro" "Almagro" "Palermo" "Palermo" ...
$ rooms : num 0 1 1 2 2 1 1 1 1 1 ...
$ bedrooms : num 0 0 0 0 0 0 0 0 0 0 ...
$ bathrooms : num 0 1 1 1 1 1 1 1 1 1 ...
$ surface_total : num 25 38 35 51 53 30 33 34 37 30 ...
$ surface_covered : num 25 31 30 46 53 30 33 34 37 30 ...
$ price : num 85000 110000 105000 150000 136500 ...
$ property_type : chr "Departamento" "Departamento" "Departamento" "Departamento" ...
$ coords : chr "(-34.5973636, -58.372987)" "(-34.6000044, -58.4171906)" "(-34.5816987, -58.4335472)" "(-34.5950443, -58.4425378)" ...
$ min_dist_subte : num 0.15 0.49 0.74 0.5 0.32 0.07 0.2 2.38 0.33 0.81 ...
$ min_dist_ferrocarril : num 0.87 1.23 0.61 2.2 0.98 0.26 1.73 0.68 1.77 1.24 ...
$ min_dist_farmacia : num 0.12 0.26 0.27 0.14 0.17 0.07 0.1 0.38 0.16 0.1 ...
$ min_dist_estadio : num 0.72 2.99 1.3 0.62 1.49 0.79 2.91 3.3 0.71 1.78 ...
$ nivel_ruido : chr "70-75 dBA" "65-70 dBA" "60-65 dBA" "60-65 dBA" ...
$ currency : chr "USD" "USD" "USD" "USD" ...
$ nivel_ruido_encoded : int 7 6 5 5 7 7 6 4 7 6 ...
$ id_encoded : int 38137 53236 36552 44609 65429 40203 35581 72222 57182 36053 ...
$ iforest_less_variables : int 1 1 1 1 1 1 1 1 1 1 ...
$ iforest_more_variables : int 1 1 1 1 1 1 1 1 1 1 ...
$ iforest_more_variables_scores: num -0.512 -0.432 -0.405 -0.422 -0.392 ...
$ iforest_less_variables_scores: num -0.574 -0.393 -0.387 -0.388 -0.398 ...
$ lof_less_variables : int 1 1 1 1 1 1 1 1 1 1 ...
$ lof_less_variables_scores : num -1.098 -1.078 -0.985 -0.992 -0.956 ...
$ lof_more_variables : int 1 1 1 1 1 1 1 1 1 1 ...
$ lof_more_variables_scores : num -1.543 -8.366 -1.06 -0.994 -1.15 ...
$ odin_more_variables_scores : int 1 18 51 14 17 35 45 21 40 7 ...
$ odin_more_variables : num 1 1 1 1 1 1 1 1 1 1 ...
$ odin_less_variables_scores : int 15 13 43 43 38 15 39 32 21 21 ...
$ odin_less_variables : num 1 1 1 1 1 1 1 1 1 1 ...
$ start_date : chr "2020-08-22" "2020-08-22" "2020-08-22" "2020-08-22" ...
$ end_date : chr "2020-09-04" "2020-09-04" "2020-09-04" "2020-09-04" ...
$ created_on : chr "2020-08-22" "2020-08-22" "2020-08-22" "2020-08-22" ...
datos.filt <- datos %>% # agarro los datos originales
mutate(Pm2=price/surface_covered, # creo nuevas variables
Ruido=factor(ifelse(nivel_ruido_encoded<5,"Bajo",ifelse(nivel_ruido_encoded>6,"Alto","Medio")),levels=c("Bajo","Medio","Alto")),
start_date=ymd(start_date), end_date=ymd(end_date),
created_on=ymd(created_on),
Mom=as.numeric(start_date),
Momento=(Mom-min(Mom))/(max(Mom)-min(Mom)),) %>%
filter(price>50000, # Filtro registros
price<999999,
rooms>0,
bedrooms>0,
bathrooms>0,
surface_covered>20,
surface_covered<300,
lat>(-34.8),lat<(-34.5),lon>(-58.6),lon<(-58.3)) %>%
select(id,lat, # Selecciono las variables con las que me quedo
lon,l3,start_date,end_date,
rooms,bathrooms,
bedrooms,Ruido,
surface_covered,
Pm2,
Momento,
price,min_dist_subte,min_dist_ferrocarril,min_dist_farmacia) %>%
rename(Barrio=l3,Lat=lat,
Lon=lon, # Cambio nombres
Rooms=rooms,
Baths=bathrooms,
Beds=bedrooms,
Sup=surface_covered,
Com=start_date,
Fin=end_date,DistFarm=min_dist_farmacia,
Precio=price,DistTren=min_dist_ferrocarril,DistSubte=min_dist_subte) %>%
distinct(id,.keep_all = T) %>% # remove duplicated ids
distinct(Lon,Lat,.keep_all = T) %>% # saco deptos con misma ubicacion
na.omit() #%>% # elimino faltantes
# sample_frac() %>% # aleatoriza las filas
# arrange(Pm2,Precio,Rooms) # Ordeno por precio
# Elimino deptos de barrios poco importantes
minimo<-100 # cantidad minima de deptos por barrio
rareBarrio <- datos.filt %>% count(Barrio) %>% filter(n < minimo) %>% pull(Barrio)
datos.filt <- datos.filt %>% filter(!Barrio %in% rareBarrio)
# veamos
datos.filt
ggplot(datos.filt,aes(x=Precio,y=..density..),xlim(0,300000))+
geom_histogram(bins=24,color="steelblue",fill="Aquamarine4")+
geom_density(fill="Aquamarine3", alpha=0.4,adjust=2)+
xlab("Precio") + #etiqueta del eje x
ylab("Frecuencia") + #etiqueta del eje y
ggtitle("Distribución de Precios de los Departamentos") + #tÃtulo del gráfico
theme_minimal() # le quitamos el fondo gris y los bordes al gráfico
Valores.medios <- datos.filt %>% group_by(Barrio) %>% select_if(.,is.numeric) %>% summarise_all(list(Media=mean,Mediana=median))
Pm2.por.barrio <- datos.filt %>% group_by(Barrio) %>% summarise(Cant=n(),Pm2.barrio=mean(Precio/Sup)) %>% arrange(Pm2.barrio)
Resumen <- Pm2.por.barrio %>% inner_join(Valores.medios)
Joining with `by = join_by(Barrio)`
#
Resumen.sel <- Resumen %>% select(Barrio,Sup_Media,Rooms_Media,Beds_Media,Baths_Media) %>% rename(Sup=Sup_Media,Rooms=Rooms_Media,Beds=Beds_Media,Baths=Baths_Media) %>% filter(Barrio %in% c("Almagro","Flores","Villa Lugano","Puerto Madero"))
Resumen.sel
rec_colors <- c("#F38181", "#FCE38A", "#85E9D3","#113366","#667744","#2299CC")
radarchart(
df = Resumen.sel[,-1],
maxmin = FALSE, # la función calcula sola el máximo y mÃnimo
pcol=rec_colors,
cglty = 1, # tipo de lÃnea para la grilla del radar
cglcol = "gray", # color de lÃnea para la grilla del radar
plwd = 3, # Ancho de lÃnea para las variables
plty = 1, # Tipo de lÃnea para las variables
)
legend("topleft", # posición de la leyenda
legend = Resumen.sel$Barrio, #nombres de la leyenda
bty = "o", #tipo de recuadro usado para la leyenda
pch = 16, #simbolos de la leyenda
col =rec_colors , # usar los mismos colores que en el radar
text.col = "Black",
pt.cex = 3 #tamaño de los puntos de la figura
)
require(ggridges)
datos.filt %>%
ggplot(aes(x=Pm2,
y=fct_reorder(Barrio,Pm2),
fill=Barrio))+
geom_density_ridges()+
theme(legend.position = "none")+
scale_x_continuous(trans="log10")+
labs(y="Barrio",x="Precio del Metro Cuacrado (PM2)") + xlim(500,7400)
Scale for x is already present.
Adding another scale for x, which will replace the existing scale.
ggplot(datos.filt, aes(x = Sup, y = Precio)) +
geom_hex(bins = 50) +
stat_smooth(method = "gam", se = T, color="magenta")+
theme_bw() +
xlab("Superficie") + #etiqueta del eje x
ylab("Precio") + #etiqueta del eje y
ggtitle("Relacion entre Precio y Superficie") #tÃtulo
Resumen.sel2 <- Resumen %>% select(Barrio,Beds_Media,Baths_Media,Pm2_Media) # %>% filter(Barrio %in% c("Almagro","Flores","Retiro","Puerto Madero"))
ggplot(data=Resumen.sel2, aes(x = Beds_Media, y = Baths_Media)) +
geom_point(aes(color = Barrio,size=Pm2_Media,alpha=0.5))+
scale_size_continuous(range = c(1, 12)) +
geom_text(aes(label = Barrio ),
size = 3, vjust = 1)+
theme_bw() + theme(legend.position="none") + geom_abline(intercept=0,slope=1) +
xlab("Dormitorios Promedio") + #etiqueta del eje x
ylab("Baños Promedio") + #etiqueta del eje y
ggtitle("Relacion entre Baños y Dormitorios por Barrio") #tÃtulo
ggplot(data = datos.filt , aes(x = Com, y = Pm2,color=Barrio,fill=Barrio)) +
# geom_hex(color = "#00AFBB",bins=40) +
stat_smooth(
method = "lm",
se=F
) + theme(legend.position="right",legend.key.size = unit(0.2, 'cm'),legend.key.height = unit(0.5, 'cm'),legend.text = element_text(size=8)) +
xlab("Fecha") + #etiqueta del eje x
ylab("Precio del Metro Cuadrado") + #etiqueta del eje y
ggtitle("Evolución Temporal del Pm2 por Barrio") #tÃtulo
require(gplots)
Loading required package: gplots
Attaching package: ‘gplots’
The following object is masked from ‘package:plotrix’:
plotCI
The following object is masked from ‘package:spatstat.geom’:
col2hex
The following object is masked from ‘package:stats’:
lowess
datos.filt.grandes<-datos.filt
df<-data.frame(Barrios=as.factor(datos.filt.grandes$Barrio),Ruidos=datos.filt.grandes$Ruido,Cuartos=as.factor(datos.filt.grandes$Rooms))
# heatmap escaldo por Cuarto
BarriosyCuartos<-xtabs(~Cuartos+Barrios,data=df)
heatmap.2(BarriosyCuartos,col = bluered,dendrogram = "none",,trace = "none",scale="col",cexCol = 0.6,margins=c(7,4),Rowv=NA,keysize = 1.5)
title("Cantidad de Ambientes por Barrio")
Explico el Precio con:
ajus.lineal<-lm(Precio~DistSubte+poly(Lon,3)*poly(Lat,3)+poly(Sup,2)+Momento+Baths+Beds+as.factor(Barrio),data =datos.filt)
#
summary(ajus.lineal)
Call:
lm(formula = Precio ~ DistSubte + poly(Lon, 3) * poly(Lat, 3) +
poly(Sup, 2) + Momento + Baths + Beds + as.factor(Barrio),
data = datos.filt)
Residuals:
Min 1Q Median 3Q Max
-617250 -30194 -1458 25075 624270
Coefficients:
Estimate Std. Error t value
(Intercept) 2.221e+05 3.903e+03 56.906
DistSubte 1.620e+04 1.340e+03 12.095
poly(Lon, 3)1 1.462e+07 5.098e+05 28.672
poly(Lon, 3)2 7.048e+06 3.861e+05 18.255
poly(Lon, 3)3 3.232e+06 2.626e+05 12.308
poly(Lat, 3)1 1.592e+07 5.592e+05 28.467
poly(Lat, 3)2 9.158e+06 3.578e+05 25.596
poly(Lat, 3)3 6.914e+06 4.430e+05 15.608
poly(Sup, 2)1 1.884e+07 1.315e+05 143.258
poly(Sup, 2)2 -1.063e+06 7.406e+04 -14.353
Momento -1.605e+04 1.335e+03 -12.024
Baths 3.967e+04 8.716e+02 45.517
Beds -1.765e+04 7.803e+02 -22.616
as.factor(Barrio)Balvanera -2.462e+04 3.223e+03 -7.638
as.factor(Barrio)Barracas 9.616e+03 8.969e+03 1.072
as.factor(Barrio)Barrio Norte -6.240e+03 3.429e+03 -1.820
as.factor(Barrio)Belgrano 3.804e+04 4.082e+03 9.319
as.factor(Barrio)Boca -2.506e+04 1.155e+04 -2.171
as.factor(Barrio)Boedo -1.086e+04 4.716e+03 -2.302
as.factor(Barrio)Caballito 4.440e+03 3.094e+03 1.435
as.factor(Barrio)Chacarita 1.537e+04 5.847e+03 2.629
as.factor(Barrio)Coghlan 3.426e+04 6.267e+03 5.467
as.factor(Barrio)Colegiales 2.244e+04 4.326e+03 5.187
as.factor(Barrio)Congreso -2.771e+04 5.388e+03 -5.143
as.factor(Barrio)Constitución -3.404e+04 7.442e+03 -4.574
as.factor(Barrio)Flores -3.054e+04 5.111e+03 -5.976
as.factor(Barrio)Floresta -2.431e+04 7.191e+03 -3.380
as.factor(Barrio)Liniers 4.305e+02 1.377e+04 0.031
as.factor(Barrio)Mataderos -3.229e+04 1.266e+04 -2.551
as.factor(Barrio)Monserrat -1.379e+04 5.284e+03 -2.609
as.factor(Barrio)Monte Castro -2.863e+04 9.436e+03 -3.034
as.factor(Barrio)Nuñez 4.451e+04 5.587e+03 7.967
as.factor(Barrio)Once -3.922e+04 5.795e+03 -6.768
as.factor(Barrio)Palermo 3.153e+04 3.047e+03 10.347
as.factor(Barrio)Parque Chacabuco -2.359e+04 5.624e+03 -4.195
as.factor(Barrio)Parque Patricios -2.006e+04 6.805e+03 -2.947
as.factor(Barrio)Paternal -3.488e+04 6.645e+03 -5.249
as.factor(Barrio)Puerto Madero 2.685e+05 7.996e+03 33.579
as.factor(Barrio)Recoleta -4.204e+03 3.296e+03 -1.276
as.factor(Barrio)Retiro -3.187e+04 5.285e+03 -6.029
as.factor(Barrio)Saavedra 3.517e+04 6.774e+03 5.192
as.factor(Barrio)San Cristobal -1.177e+04 4.772e+03 -2.466
as.factor(Barrio)San Nicolás -3.143e+04 4.916e+03 -6.393
as.factor(Barrio)San Telmo 1.012e+04 6.621e+03 1.528
as.factor(Barrio)Villa Crespo 1.372e+03 3.005e+03 0.456
as.factor(Barrio)Villa del Parque -2.562e+04 6.102e+03 -4.198
as.factor(Barrio)Villa Devoto 2.423e+03 8.428e+03 0.287
as.factor(Barrio)Villa General Mitre -3.265e+04 7.481e+03 -4.365
as.factor(Barrio)Villa Lugano -1.158e+05 1.682e+04 -6.886
as.factor(Barrio)Villa Luro -8.494e+03 1.002e+04 -0.848
as.factor(Barrio)Villa Ortuzar 2.348e+04 7.096e+03 3.309
as.factor(Barrio)Villa Pueyrredón 1.002e+04 7.697e+03 1.302
as.factor(Barrio)Villa Santa Rita -2.645e+04 7.962e+03 -3.322
as.factor(Barrio)Villa Urquiza 4.539e+04 5.114e+03 8.876
poly(Lon, 3)1:poly(Lat, 3)1 3.308e+09 1.225e+08 26.998
poly(Lon, 3)2:poly(Lat, 3)1 1.931e+09 8.698e+07 22.199
poly(Lon, 3)3:poly(Lat, 3)1 6.397e+08 5.704e+07 11.214
poly(Lon, 3)1:poly(Lat, 3)2 1.827e+09 7.838e+07 23.317
poly(Lon, 3)2:poly(Lat, 3)2 9.162e+08 5.648e+07 16.222
poly(Lon, 3)3:poly(Lat, 3)2 3.339e+08 3.818e+07 8.744
poly(Lon, 3)1:poly(Lat, 3)3 1.346e+09 8.384e+07 16.058
poly(Lon, 3)2:poly(Lat, 3)3 6.321e+08 5.432e+07 11.637
poly(Lon, 3)3:poly(Lat, 3)3 2.410e+08 3.813e+07 6.320
Pr(>|t|)
(Intercept) < 2e-16 ***
DistSubte < 2e-16 ***
poly(Lon, 3)1 < 2e-16 ***
poly(Lon, 3)2 < 2e-16 ***
poly(Lon, 3)3 < 2e-16 ***
poly(Lat, 3)1 < 2e-16 ***
poly(Lat, 3)2 < 2e-16 ***
poly(Lat, 3)3 < 2e-16 ***
poly(Sup, 2)1 < 2e-16 ***
poly(Sup, 2)2 < 2e-16 ***
Momento < 2e-16 ***
Baths < 2e-16 ***
Beds < 2e-16 ***
as.factor(Barrio)Balvanera 2.28e-14 ***
as.factor(Barrio)Barracas 0.283668
as.factor(Barrio)Barrio Norte 0.068810 .
as.factor(Barrio)Belgrano < 2e-16 ***
as.factor(Barrio)Boca 0.029973 *
as.factor(Barrio)Boedo 0.021325 *
as.factor(Barrio)Caballito 0.151385
as.factor(Barrio)Chacarita 0.008571 **
as.factor(Barrio)Coghlan 4.62e-08 ***
as.factor(Barrio)Colegiales 2.15e-07 ***
as.factor(Barrio)Congreso 2.72e-07 ***
as.factor(Barrio)Constitución 4.81e-06 ***
as.factor(Barrio)Flores 2.32e-09 ***
as.factor(Barrio)Floresta 0.000725 ***
as.factor(Barrio)Liniers 0.975054
as.factor(Barrio)Mataderos 0.010732 *
as.factor(Barrio)Monserrat 0.009077 **
as.factor(Barrio)Monte Castro 0.002414 **
as.factor(Barrio)Nuñez 1.69e-15 ***
as.factor(Barrio)Once 1.34e-11 ***
as.factor(Barrio)Palermo < 2e-16 ***
as.factor(Barrio)Parque Chacabuco 2.73e-05 ***
as.factor(Barrio)Parque Patricios 0.003208 **
as.factor(Barrio)Paternal 1.54e-07 ***
as.factor(Barrio)Puerto Madero < 2e-16 ***
as.factor(Barrio)Recoleta 0.202129
as.factor(Barrio)Retiro 1.67e-09 ***
as.factor(Barrio)Saavedra 2.09e-07 ***
as.factor(Barrio)San Cristobal 0.013668 *
as.factor(Barrio)San Nicolás 1.65e-10 ***
as.factor(Barrio)San Telmo 0.126501
as.factor(Barrio)Villa Crespo 0.648067
as.factor(Barrio)Villa del Parque 2.70e-05 ***
as.factor(Barrio)Villa Devoto 0.773750
as.factor(Barrio)Villa General Mitre 1.27e-05 ***
as.factor(Barrio)Villa Lugano 5.86e-12 ***
as.factor(Barrio)Villa Luro 0.396475
as.factor(Barrio)Villa Ortuzar 0.000939 ***
as.factor(Barrio)Villa Pueyrredón 0.192947
as.factor(Barrio)Villa Santa Rita 0.000895 ***
as.factor(Barrio)Villa Urquiza < 2e-16 ***
poly(Lon, 3)1:poly(Lat, 3)1 < 2e-16 ***
poly(Lon, 3)2:poly(Lat, 3)1 < 2e-16 ***
poly(Lon, 3)3:poly(Lat, 3)1 < 2e-16 ***
poly(Lon, 3)1:poly(Lat, 3)2 < 2e-16 ***
poly(Lon, 3)2:poly(Lat, 3)2 < 2e-16 ***
poly(Lon, 3)3:poly(Lat, 3)2 < 2e-16 ***
poly(Lon, 3)1:poly(Lat, 3)3 < 2e-16 ***
poly(Lon, 3)2:poly(Lat, 3)3 < 2e-16 ***
poly(Lon, 3)3:poly(Lat, 3)3 2.65e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 65810 on 28407 degrees of freedom
Multiple R-squared: 0.8227, Adjusted R-squared: 0.8223
F-statistic: 2126 on 62 and 28407 DF, p-value: < 2.2e-16
Pred<-predict(ajus.lineal)
datos.filt$Pred.lm<-Pred
# R2
elsum<-summary(ajus.lineal)
R2.lm<-elsum$r.squared
# Observed Vs predicted
ggplot(datos.filt, aes(x = Pred.lm, y = Precio)) +
geom_hex(bins = 30) +
stat_smooth(method = "gam", se = T, color="magenta")+
theme_bw() +
xlab("Precio Predicho") + #etiqueta del eje x
ylab("Precio Observado") + #etiqueta del eje y
ggtitle(paste("Precio Observado Vs. Predicho por LM - R2 =",as.character(round(R2.lm,2))))+ #tÃtulo
geom_abline(intercept = 0,slope = 1,color="blue")
# PMAE
pmae.lineal<-mean((abs(datos.filt$Precio-Pred))/datos.filt$Precio)
pmae.lineal
[1] 0.225693
Particion en Train - Test
# Set train and test datasets
set.seed(1)
trainMN <- datos.filt %>%
dplyr::sample_frac(size = .75,replace = FALSE)
testMN <- datos.filt %>% filter(!id %in% trainMN$id)
#
datosMNX <- useful::build.x(Precio~DistSubte+Lon+Lat+Sup+Momento+Baths+Beds+as.factor(Barrio) - 1, datos.filt, contrasts = FALSE)
trainMNX <- useful::build.x(Precio~DistSubte+Lon+Lat+Sup+Momento+Baths+Beds+as.factor(Barrio) - 1, trainMN, contrasts = FALSE)
trainMNY <- useful::build.y(Precio~DistSubte+Lon+Lat+Sup+Momento+Baths+Beds+as.factor(Barrio) - 1, trainMN)
testMNX <- useful::build.x(Precio~DistSubte+Lon+Lat+Sup+Momento+Baths+Beds+as.factor(Barrio) - 1, testMN, contrasts = FALSE)
testMNY <- useful::build.y(Precio~DistSubte+Lon+Lat+Sup+Momento+Baths+Beds+as.factor(Barrio) - 1, testMN)
dim(datosMNX)
[1] 28470 49
dim(trainMNX)
[1] 21352 49
dim(testMNX)
[1] 7118 49
Entrenamiento con XGBoost
grid_default <- expand.grid(
nrounds = 150,
max_depth = 6,
eta = 0.1,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
train_control <- caret::trainControl(
method = "none",
verboseIter = FALSE, # no training log
allowParallel = TRUE
)
xgb_base <- caret::train(
x = trainMNX,
y = trainMNY,
trControl = train_control,
tuneGrid = grid_default,
method = "xgbTree",
verbose = TRUE,
nthreads = 4
)
[17:12:49] WARNING: src/learner.cc:767:
Parameters: { "nthreads" } are not used.
#
basePred <- predict(xgb_base,testMNX)
baseRMSE <- caret::RMSE(basePred, testMNY)
baseRMSE
[1] 57675.72
# Calculo de predichos
datos.filt$Pred<-predict(xgb_base,datosMNX)
# R2
sstot<-sum((testMN$Precio-mean(testMN$Precio))**2)
ssres<-sum((testMN$Precio-basePred)**2)
R2.xgb<-1-(ssres)/(sstot)
R2.xgb
[1] 0.8646959
# Observed Vs predicted
ggplot(datos.filt, aes(x = Pred, y = Precio)) +
geom_hex(bins = 30) +
stat_smooth(method = "gam", se = T, color="magenta")+
theme_bw() +
xlab("Precio Predicho") + #etiqueta del eje x
ylab("Precio Observado") + #etiqueta del eje y
ggtitle(paste("Precio Observado Vs. Predicho por XGBoost - R2 =",as.character(round(R2.xgb,2))))+ #tÃtulo
geom_abline(intercept = 0,slope = 1,color="blue")
# PMAE
pmae.xgb<-mean((abs(testMN$Precio-basePred))/testMN$Precio)
pmae.xgb
[1] 0.1829259
# Importancia de las variables
importancia<-varImp(xgb_base)
plot(importancia,10,main="Importancia de Variables de XGBoost")
# Calculo de Residuos: Obs - Pred
datos.filt$Res<-datos.filt$Precio-datos.filt$Pred
# Calculo de Residuos Relativos: (Obs - Pred) / Obs
datos.filt$ResidRel<-datos.filt$Res/datos.filt$Precio
# Residuos relativos topeados
tope<-0.0025 #
qbajo<-quantile(datos.filt$ResidRel,tope)
qalto<-quantile(datos.filt$ResidRel,1-tope)
resrel.mod<-datos.filt$ResidRel
resrel.mod[resrel.mod<qbajo]<-qbajo
resrel.mod[resrel.mod>qalto]<-qalto
datos.filt$ResidRelTop<-resrel.mod
# Residuos Topeados Balanceados: Equiparo los rangos positivos y negativos
resrel.min<-abs(min(datos.filt$ResidRelTop))
resrel.max<-max(datos.filt$ResidRelTop)
resrel.bal<-ifelse(datos.filt$ResidRelTop<=0,datos.filt$ResidRelTop/resrel.min,datos.filt$ResidRelTop/resrel.max)
datos.filt$ResidRelBal<-resrel.bal
# Visualizacion Residuos Relativos y Topeados
# Comparacion de Residuos
ggplot(datos.filt)+
geom_histogram(aes(x=ResidRelTop,y=..density..),bins=90,color="steelblue",fill="Aquamarine4")+
geom_density(aes(x=ResidRel,y=..density..),fill="Aquamarine3",color="red", alpha=0.4,adjust=2) + xlim(-1,1) +
xlab("Residuo Relativos") + #etiqueta del eje x
ylab("Frecuencia") + #etiqueta del eje y
ggtitle("Distribución de los Residuos Relativos") #tÃtulo del gráfico
# Visualizacion Residuos Balanceados
ggplot(datos.filt)+
geom_histogram(aes(x=ResidRelBal,y=..density..),bins=90,color="steelblue",fill="Aquamarine4")+
geom_density(aes(x=ResidRel,y=..density..),fill="Aquamarine3",color="red", alpha=0.4,adjust=2) + xlim(-1,1) +
xlab("Residuo Relativos") + #etiqueta del eje x
ylab("Frecuencia") + #etiqueta del eje y
ggtitle("Distribución de los Residuos Balanceados") #tÃtulo del gráfico
# Ordeno residyuos relativos
coord_cartesian(xlim=c(-1,1))
<ggproto object: Class CoordCartesian, Coord, gg>
aspect: function
backtransform_range: function
clip: on
default: FALSE
distance: function
expand: TRUE
is_free: function
is_linear: function
labels: function
limits: list
modify_scales: function
range: function
render_axis_h: function
render_axis_v: function
render_bg: function
render_fg: function
setup_data: function
setup_layout: function
setup_panel_guides: function
setup_panel_params: function
setup_params: function
train_panel_guides: function
transform: function
super: <ggproto object: Class CoordCartesian, Coord, gg>
orden<-order(datos.filt$ResidRel)
cuantos<-100
# los residyuos relativos mas bajos
datos.filt %>% slice(orden[1:cuantos]) %>% select(Precio,Pred,Pred.lm,Sup,Pm2,Rooms,Baths,Beds,Barrio)
# los residyuos relativos mas altos
datos.filt %>% slice(rev(orden)[1:cuantos]) %>% select(Precio,Pred,Pred.lm,Sup,Pm2,Rooms,Baths,Beds,Barrio)
require(boot)
Loading required package: boot
Attaching package: ‘boot’
The following object is masked from ‘package:sm’:
dogs
The following object is masked from ‘package:survival’:
aml
The following object is masked from ‘package:car’:
logit
The following object is masked from ‘package:lattice’:
melanoma
The following object is masked from ‘package:spatstat.explore’:
envelope
grados<-10
deltas <- rep(NA, grados)
for (i in 1:grados)
{
#i<-3
fit <- glm(ResidRel~poly(Lon,i)*poly(Lat,i),data=datos.filt)
deltas[i] <- boot::cv.glm(datos.filt, fit, K = 10)$delta[1]
}
Ajus.Esp.df<-data.frame(Df=(1:grados),Error=deltas)
# Visualizacion
ggplot(Ajus.Esp.df, aes(x=Df,y=Error)) + geom_line() +
xlab("Grados de Libertad") + #etiqueta del eje x
ylab("Error por CV") + #etiqueta del eje y
ggtitle("Estructura Espacial de los Resoduos") #tÃtulo
# Modelo Optimo
ajus.lm.tend<-lm(ResidRelTop~poly(Lon,6)*poly(Lat,6),data=datos.filt)
#ajus.lm.tend<-lm(ResidRelTop~bs(Lon,df=6)*bs(Lat,df=6),data=datos.filt)
summary(ajus.lm.tend)
Call:
lm(formula = ResidRelTop ~ poly(Lon, 6) * poly(Lat, 6), data = datos.filt)
Residuals:
Min 1Q Median 3Q Max
-0.94804 -0.12184 0.02051 0.14439 0.54349
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.852e-01 2.035e-01 -2.384 0.0171 *
poly(Lon, 6)1 -1.111e+02 5.576e+01 -1.992 0.0464 *
poly(Lon, 6)2 -1.038e+02 6.183e+01 -1.679 0.0931 .
poly(Lon, 6)3 -7.039e+01 5.024e+01 -1.401 0.1612
poly(Lon, 6)4 -3.168e+01 3.360e+01 -0.943 0.3457
poly(Lon, 6)5 -8.908e+00 1.411e+01 -0.631 0.5278
poly(Lon, 6)6 -6.906e-01 6.088e+00 -0.113 0.9097
poly(Lat, 6)1 -1.026e+02 5.218e+01 -1.966 0.0493 *
poly(Lat, 6)2 -1.586e+02 6.957e+01 -2.280 0.0226 *
poly(Lat, 6)3 -5.764e+01 3.622e+01 -1.591 0.1115
poly(Lat, 6)4 -1.050e+02 4.566e+01 -2.299 0.0215 *
poly(Lat, 6)5 -2.309e+01 1.272e+01 -1.815 0.0695 .
poly(Lat, 6)6 -2.430e+01 9.943e+00 -2.444 0.0145 *
poly(Lon, 6)1:poly(Lat, 6)1 -2.682e+04 1.436e+04 -1.868 0.0618 .
poly(Lon, 6)2:poly(Lat, 6)1 -2.537e+04 1.607e+04 -1.579 0.1144
poly(Lon, 6)3:poly(Lat, 6)1 -1.831e+04 1.310e+04 -1.398 0.1623
poly(Lon, 6)4:poly(Lat, 6)1 -8.699e+03 8.804e+03 -0.988 0.3231
poly(Lon, 6)5:poly(Lat, 6)1 -3.146e+03 3.682e+03 -0.855 0.3928
poly(Lon, 6)6:poly(Lat, 6)1 -4.469e+02 1.602e+03 -0.279 0.7802
poly(Lon, 6)1:poly(Lat, 6)2 -3.857e+04 1.875e+04 -2.057 0.0397 *
poly(Lon, 6)2:poly(Lat, 6)2 -3.334e+04 2.042e+04 -1.633 0.1025
poly(Lon, 6)3:poly(Lat, 6)2 -2.005e+04 1.648e+04 -1.217 0.2237
poly(Lon, 6)4:poly(Lat, 6)2 -5.586e+03 1.108e+04 -0.504 0.6141
poly(Lon, 6)5:poly(Lat, 6)2 -2.380e+02 4.707e+03 -0.051 0.9597
poly(Lon, 6)6:poly(Lat, 6)2 1.590e+03 2.083e+03 0.763 0.4453
poly(Lon, 6)1:poly(Lat, 6)3 -1.598e+04 9.931e+03 -1.609 0.1075
poly(Lon, 6)2:poly(Lat, 6)3 -1.494e+04 1.125e+04 -1.329 0.1840
poly(Lon, 6)3:poly(Lat, 6)3 -1.215e+04 9.301e+03 -1.306 0.1915
poly(Lon, 6)4:poly(Lat, 6)3 -5.757e+03 6.412e+03 -0.898 0.3693
poly(Lon, 6)5:poly(Lat, 6)3 -2.916e+03 2.756e+03 -1.058 0.2899
poly(Lon, 6)6:poly(Lat, 6)3 -8.531e+02 1.241e+03 -0.688 0.4917
poly(Lon, 6)1:poly(Lat, 6)4 -2.512e+04 1.216e+04 -2.066 0.0388 *
poly(Lon, 6)2:poly(Lat, 6)4 -2.006e+04 1.307e+04 -1.535 0.1248
poly(Lon, 6)3:poly(Lat, 6)4 -1.084e+04 1.048e+04 -1.035 0.3009
poly(Lon, 6)4:poly(Lat, 6)4 -1.205e+03 7.105e+03 -0.170 0.8654
poly(Lon, 6)5:poly(Lat, 6)4 1.001e+03 3.029e+03 0.330 0.7412
poly(Lon, 6)6:poly(Lat, 6)4 1.599e+03 1.398e+03 1.144 0.2526
poly(Lon, 6)1:poly(Lat, 6)5 -5.851e+03 3.444e+03 -1.699 0.0894 .
poly(Lon, 6)2:poly(Lat, 6)5 -4.443e+03 3.812e+03 -1.165 0.2439
poly(Lon, 6)3:poly(Lat, 6)5 -3.335e+03 3.061e+03 -1.089 0.2759
poly(Lon, 6)4:poly(Lat, 6)5 -1.113e+03 2.056e+03 -0.541 0.5883
poly(Lon, 6)5:poly(Lat, 6)5 -5.203e+02 8.416e+02 -0.618 0.5364
poly(Lon, 6)6:poly(Lat, 6)5 -6.416e+00 3.891e+02 -0.016 0.9868
poly(Lon, 6)1:poly(Lat, 6)6 -5.436e+03 2.570e+03 -2.115 0.0344 *
poly(Lon, 6)2:poly(Lat, 6)6 -3.956e+03 2.683e+03 -1.475 0.1403
poly(Lon, 6)3:poly(Lat, 6)6 -1.441e+03 2.142e+03 -0.673 0.5012
poly(Lon, 6)4:poly(Lat, 6)6 3.668e+02 1.504e+03 0.244 0.8072
poly(Lon, 6)5:poly(Lat, 6)6 3.598e+02 6.554e+02 0.549 0.5829
poly(Lon, 6)6:poly(Lat, 6)6 2.562e+02 3.341e+02 0.767 0.4431
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2149 on 28421 degrees of freedom
Multiple R-squared: 0.006681, Adjusted R-squared: 0.005003
F-statistic: 3.982 on 48 and 28421 DF, p-value: < 2.2e-16
De ahora en mas se trabaja con clases espaciales:
# Load CABA borders
provincia.comp <- st_read("/home/andresfaral/Documents/Estadistica Espacial/Provincia/")
Reading layer `ign_provincia' from data source
`/home/andresfaral/Documents/Estadistica Espacial/Provincia'
using driver `ESRI Shapefile'
Simple feature collection with 24 features and 11 fields
Geometry type: MULTIPOLYGON
Dimension: XYZ
Bounding box: xmin: -74 ymin: -90 xmax: -25 ymax: -21.78086
z_range: zmin: 0 zmax: 0
CRS: 4326
caba.ch<-st_convex_hull(provincia.comp[1,]$geometry)
#plot(caba.ch)
coo<-st_coordinates(caba.ch)[-35,1:2]
caba.win<-owin(poly=list(x=rev(coo[,1]),y=rev(coo[,2])))
#plot(caba.win)
datos.sf <- datos.filt %>% st_as_sf(coords = c("Lon", "Lat"), crs = 4326)
distancias<-st_distance(datos.sf,caba.ch)
st_as_s2(): dropping Z and/or M coordinate
quedan<-(as.numeric(distancias)==0)
sum(quedan)
[1] 28469
datos.sf<-datos.sf[quedan,]
datos.sf
Simple feature collection with 28469 features and 21 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -58.52982 ymin: -34.69437 xmax: -58.34395 ymax: -34.536
CRS: EPSG:4326
First 10 features:
id Barrio Com Fin Rooms
1 IME5Nrzj5aNaSBjFhpgvdw== Once 2020-08-22 2020-09-04 2
2 93ZVIVml1tfP8xkxkDRRBg== Coghlan 2020-08-22 2020-09-04 2
3 EAhKIB69L2XPTuFgPi8SDw== Palermo 2020-08-22 2020-09-04 2
4 m6q5Fz9rkYidoCmTq4ADgQ== Palermo 2020-08-22 2020-09-04 2
5 R513B5GTU9wcT2atsVozYQ== Palermo 2020-08-22 2020-09-04 2
6 Uu6NJkZpbwC6p5PzPovFfg== Palermo 2020-08-22 2020-09-04 2
7 vpgqe4cxZyh5+LQjA3GFKA== Belgrano 2020-08-22 2020-09-04 2
8 geQdN3EWS2nnZNG08HoFMQ== Belgrano 2020-08-22 2020-09-04 2
9 I0SosBpnfl0i3gyrIvmfSw== Villa Devoto 2020-08-22 2020-09-04 2
10 uFk/f/OoOI3e3djHiekIEA== Villa Urquiza 2020-08-22 2020-09-04 2
Baths Beds Ruido Sup Pm2 Momento Precio DistSubte DistTren
1 1 1 Medio 40 2725.000 0.2313625 109000 0.06 0.56
2 1 1 Medio 42 3380.952 0.2313625 142000 1.64 0.72
3 1 1 Medio 45 4764.444 0.2313625 214400 0.46 0.39
4 1 1 Bajo 30 3833.333 0.2313625 115000 0.34 0.97
5 1 1 Medio 44 5795.455 0.2313625 255000 0.43 0.69
6 1 1 Alto 59 5016.949 0.2313625 296000 0.40 1.14
7 1 1 Bajo 41 3365.854 0.2313625 138000 1.48 0.38
8 1 1 Medio 42 3333.333 0.2313625 140000 0.97 0.56
9 1 1 Bajo 41 1804.878 0.2313625 74000 4.39 1.88
10 1 1 Bajo 42 3902.381 0.2313625 163900 0.87 0.72
DistFarm Pred.lm Pred Res ResidRel ResidRelTop
1 0.06 46299.09 85048.74 23951.258 0.21973631 0.21973631
2 0.08 128740.31 124873.07 17126.930 0.12061218 0.12061218
3 0.38 158761.08 165736.34 48663.656 0.22697601 0.22697601
4 0.05 124876.68 95872.06 19127.938 0.16632989 0.16632989
5 0.11 165059.19 148445.36 106554.641 0.41786134 0.41786134
6 0.30 243048.68 209872.19 86127.812 0.29097234 0.29097234
7 0.11 140193.44 125757.18 12242.820 0.08871609 0.08871609
8 0.16 192253.46 147131.77 -7131.766 -0.05094118 -0.05094118
9 0.20 116158.97 103541.61 -29541.609 -0.39921094 -0.39921094
10 0.29 131003.49 127819.89 36080.109 0.22013490 0.22013490
ResidRelBal geometry
1 0.47401747 POINT (-58.41465 -34.61043)
2 0.26018586 POINT (-58.48019 -34.55719)
3 0.48963502 POINT (-58.43904 -34.57758)
4 0.35880858 POINT (-58.41959 -34.58556)
5 0.90141484 POINT (-58.42327 -34.58488)
6 0.62768857 POINT (-58.41521 -34.58159)
7 0.19137928 POINT (-58.47165 -34.5668)
8 -0.05335444 POINT (-58.44351 -34.56119)
9 -0.41812294 POINT (-58.51557 -34.61937)
10 0.47487730 POINT (-58.48647 -34.56645)
Reescribo datos.filt sacando una obs que quedó afuera de CABA
datos.filt<-datos.filt[quedan,]
datos.filt
Conversion a ppp con Limites de CABA
# Conversion a ppp
datos.ppp <- as.ppp(st_coordinates(datos.sf), caba.win)
marks(datos.ppp)
NULL
window(datos.ppp)
Planar point pattern: 5 points
window: polygonal boundary
enclosing rectangle: [-58.53147, -58.33515] x [-34.70531, -34.52653]
units
# Adding marks
marks(datos.ppp)<-as.factor(datos.sf$ResidRel>0)
Usando objeto sf
# Paleta de colores
pal <- colorNumeric(palette = "RdBu", domain = c(min(datos.sf$ResidRelBal),max(datos.sf$ResidRelBal)))
# descriptive character field
datos.sf$Desc<-paste("Barrio",datos.filt$Barrio,"Sup",datos.filt$Sup,"Beds",datos.filt$Beds,"Baths",datos.filt$Baths,"<br>Precio",as.character(datos.filt$Precio),"Pred",as.character(round(datos.filt$Pred,2)),"ResRel",as.character(round(datos.filt$ResidRel,2)))
# Mapa
leaflet(datos.sf) %>% addTiles() %>% addCircleMarkers(fillOpacity = 0.5,weight=1,radius=~abs(ResidRelBal*20),color = ~pal(ResidRelBal),popup = ~Desc,label=~paste("Resid",as.character(round(ResidRel,2))))
require(prim)
x.data<-datos.filt[,c("Lon","Lat")]
y.data<-datos.filt[,c("ResidRelTop")]
#Resid.prim <- prim.box(x=x.data,y=y.data, threshold.type=1)
# Busco ResidRel positivos (sobre-valuados)
#Resid.prim <- prim.box(x=x.data, y=y.data, peel.alpha = 0.05, mass.min = 0.0005, threshold.type=1)
# Busco ResidRel negativos (sub-valuados)
Resid.prim <- prim.box(x=x.data, y=y.data, peel.alpha = 0.05, mass.min = 0.0005, threshold.type=-1)
# Grfico de valores de la funcion
#summary(Resid.prim, print.box=TRUE)
plot(unlist(Resid.prim$y.fun))
# ordebo cajas por valores de funcion - SUB-valuados
orden<-order(unlist(Resid.prim$y.fun))
# ordebo cajas por valores de funcion - SoBre-valuados
#orden<-rev(order(unlist(Resid.prim$y.fun)))
# Gusrdo los primeros
cant.cajas<-10
Valores<-Resid.prim$y.fun[orden[1:cant.cajas]]
Cantidades<-Resid.prim$mass[orden[1:cant.cajas]]
Cajas<-Resid.prim$box[orden[1:cant.cajas]]
# Armo los poligonos de las cajas elegidas
i<-1
caja.eleg<-Cajas[[i]]
lon = caja.eleg[,1]
lat = caja.eleg[,2]
Poly_Coord_df = data.frame(lon, lat)
pol = st_polygon( list(
cbind(
Poly_Coord_df$lon[c(1,2,2,1,1)],
Poly_Coord_df$lat[c(1,1,2,2,1)])
))
polc = st_sfc(pol, crs=4326)
polc.todos<-polc
for (i in 2:cant.cajas)
{
caja.eleg<-Cajas[[i]]
lon = caja.eleg[,1]
lat = caja.eleg[,2]
Poly_Coord_df = data.frame(lon, lat)
pol = st_polygon( list(
cbind(
Poly_Coord_df$lon[c(1,2,2,1,1)],
Poly_Coord_df$lat[c(1,1,2,2,1)])
))
polc = st_sfc(pol, crs=4326)
polc.todos<-c(polc.todos,polc)
}
polc.todos
Geometry set for 10 features
Geometry type: POLYGON
Dimension: XY
Bounding box: xmin: -58.48958 ymin: -34.63767 xmax: -58.3946 ymax: -34.5734
CRS: EPSG:4326
First 5 geometries:
POLYGON ((-58.41464 -34.61696, -58.41278 -34.61...
POLYGON ((-58.4761 -34.6291, -58.47219 -34.6291...
POLYGON ((-58.40472 -34.60984, -58.40331 -34.60...
POLYGON ((-58.46544 -34.63311, -58.46437 -34.63...
POLYGON ((-58.45572 -34.62656, -58.45276 -34.62...
#poly <- Poly_Coord_df %>%
# st_as_sf(coords = c("lon", "lat"),
# crs = 4326) %>%
# st_bbox() %>%
# st_as_sfc()
# muestro la caja
#leaflet(datos.out.sf) %>% addTiles() %>% addCircleMarkers(radius = 0.1) %>% addPolygons(data=polc,color = "red")
##
# Paleta de colores
pal <- colorNumeric(palette = "RdBu", domain = c(min(datos.sf$ResidRelBal),max(datos.sf$ResidRelBal)))
# descriptive character field
datos.sf$Desc<-paste("Barrio",datos.filt$Barrio,"Sup",datos.filt$Sup,"Beds",datos.filt$Beds,"Baths",datos.filt$Baths,"<br>Precio",as.character(datos.filt$Precio),"Pred",as.character(round(datos.filt$Pred,2)),"ResRel",as.character(round(datos.filt$ResidRel,2)))
# Mapa
leaflet(datos.sf) %>% addTiles() %>% addPolygons(data=polc.todos,fillColor = "transparent",opacity = 1,weight = 2,color = "black") %>% addCircleMarkers(fillOpacity = 0.2,weight=0.1,radius=~abs(ResidRelBal*10),color = ~pal(ResidRelBal),popup = ~Desc,label=~paste("Resid",as.character(round(ResidRel,2))))
Relative Risk Analysis Overpriced Vs Underpriced
plot(datos.ppp,cex=0.3,color=2:3)
# Intensity model
ajus.RR<-ppm(datos.ppp~marks*bs(x,df=5)*bs(y,df=5),Poisson())
ajus.RR
Nonstationary multitype Poisson process
Fitted to point pattern dataset ‘datos.ppp’
Possible marks: ‘FALSE’ and ‘TRUE’
Log intensity: ~marks * bs(x, df = 5) * bs(y, df = 5)
Fitted trend coefficients:
(Intercept)
-6.411834e+01
marksTRUE
-3.706882e+01
bs(x, df = 5)1
7.002049e+01
bs(x, df = 5)2
8.976608e+01
bs(x, df = 5)3
4.793043e+01
bs(x, df = 5)4
-2.606098e+02
bs(x, df = 5)5
6.427603e+02
bs(y, df = 5)1
8.825517e+01
bs(y, df = 5)2
7.257212e+01
bs(y, df = 5)3
7.902482e+01
bs(y, df = 5)4
6.085099e+01
bs(y, df = 5)5
3.487939e+01
marksTRUE:bs(x, df = 5)1
3.618760e+01
marksTRUE:bs(x, df = 5)2
4.569252e+01
marksTRUE:bs(x, df = 5)3
6.351976e+00
marksTRUE:bs(x, df = 5)4
2.124076e+01
marksTRUE:bs(x, df = 5)5
1.050788e+01
marksTRUE:bs(y, df = 5)1
4.396347e+01
marksTRUE:bs(y, df = 5)2
3.320653e+01
marksTRUE:bs(y, df = 5)3
4.094753e+01
marksTRUE:bs(y, df = 5)4
1.567821e+01
marksTRUE:bs(y, df = 5)5
1.041981e+02
bs(x, df = 5)1:bs(y, df = 5)1
-7.856182e+01
bs(x, df = 5)2:bs(y, df = 5)1
-1.101734e+02
bs(x, df = 5)3:bs(y, df = 5)1
-7.830335e+01
bs(x, df = 5)4:bs(y, df = 5)1
2.818078e+02
bs(x, df = 5)5:bs(y, df = 5)1
-7.125855e+02
bs(x, df = 5)1:bs(y, df = 5)2
-7.187249e+01
bs(x, df = 5)2:bs(y, df = 5)2
-7.614632e+01
bs(x, df = 5)3:bs(y, df = 5)2
-4.234133e+01
bs(x, df = 5)4:bs(y, df = 5)2
2.584511e+02
bs(x, df = 5)5:bs(y, df = 5)2
-6.242877e+02
bs(x, df = 5)1:bs(y, df = 5)3
-6.710288e+01
bs(x, df = 5)2:bs(y, df = 5)3
-9.843920e+01
bs(x, df = 5)3:bs(y, df = 5)3
-4.705555e+01
bs(x, df = 5)4:bs(y, df = 5)3
2.680405e+02
bs(x, df = 5)5:bs(y, df = 5)3
-6.631659e+02
bs(x, df = 5)1:bs(y, df = 5)4
-5.290195e+01
bs(x, df = 5)2:bs(y, df = 5)4
-6.765967e+01
bs(x, df = 5)3:bs(y, df = 5)4
-1.970739e+01
bs(x, df = 5)4:bs(y, df = 5)4
2.641014e+02
bs(x, df = 5)5:bs(y, df = 5)4
-6.536511e+02
bs(x, df = 5)1:bs(y, df = 5)5
-3.991933e+01
bs(x, df = 5)2:bs(y, df = 5)5
-3.931140e+01
bs(x, df = 5)3:bs(y, df = 5)5
-4.241099e+01
bs(x, df = 5)4:bs(y, df = 5)5
-3.658795e+02
bs(x, df = 5)5:bs(y, df = 5)5
-2.041652e+04
marksTRUE:bs(x, df = 5)1:bs(y, df = 5)1
-4.102009e+01
marksTRUE:bs(x, df = 5)2:bs(y, df = 5)1
-5.398293e+01
marksTRUE:bs(x, df = 5)3:bs(y, df = 5)1
-1.939967e+01
marksTRUE:bs(x, df = 5)4:bs(y, df = 5)1
-1.231478e+01
marksTRUE:bs(x, df = 5)5:bs(y, df = 5)1
-3.191758e+01
marksTRUE:bs(x, df = 5)1:bs(y, df = 5)2
-3.379731e+01
marksTRUE:bs(x, df = 5)2:bs(y, df = 5)2
-4.189926e+01
marksTRUE:bs(x, df = 5)3:bs(y, df = 5)2
3.449503e-01
marksTRUE:bs(x, df = 5)4:bs(y, df = 5)2
-2.450107e+01
marksTRUE:bs(x, df = 5)5:bs(y, df = 5)2
-2.628498e+00
marksTRUE:bs(x, df = 5)1:bs(y, df = 5)3
-3.916466e+01
marksTRUE:bs(x, df = 5)2:bs(y, df = 5)3
-4.973021e+01
marksTRUE:bs(x, df = 5)3:bs(y, df = 5)3
-1.283295e+01
marksTRUE:bs(x, df = 5)4:bs(y, df = 5)3
-2.044721e+01
marksTRUE:bs(x, df = 5)5:bs(y, df = 5)3
-1.584080e+01
marksTRUE:bs(x, df = 5)1:bs(y, df = 5)4
-1.201737e+01
marksTRUE:bs(x, df = 5)2:bs(y, df = 5)4
-2.642168e+01
marksTRUE:bs(x, df = 5)3:bs(y, df = 5)4
1.904832e+01
marksTRUE:bs(x, df = 5)4:bs(y, df = 5)4
-6.318376e+00
marksTRUE:bs(x, df = 5)5:bs(y, df = 5)4
-3.436411e+00
marksTRUE:bs(x, df = 5)1:bs(y, df = 5)5
-1.153675e+02
marksTRUE:bs(x, df = 5)2:bs(y, df = 5)5
-1.066830e+02
marksTRUE:bs(x, df = 5)3:bs(y, df = 5)5
-8.398085e+01
marksTRUE:bs(x, df = 5)4:bs(y, df = 5)5
-7.781077e+01
marksTRUE:bs(x, df = 5)5:bs(y, df = 5)5
6.063069e+02
#plot(ajus4,pause=FALSE,superimpose = FALSE)
pred.RR<-predict(ajus.RR)
plot(pred.RR)
Calculo del Riesgo RElativo
# por separado
#CaroVsBarato<-relrisk(datos.ppp)
#CaroVsBarato
#plot(CaroVsBarato)
# Relativo
#CaroVsBarato.rel<-relrisk(datos.ppp,relative = TRUE)
#CaroVsBarato.rel
#plot(CaroVsBarato.rel)
# fijando el sigma
CaroVsBarato.sig<-relrisk(datos.ppp,sigma = 0.003)
plot(CaroVsBarato.sig)
Spatial Autocorrelation: THERE IS NO AUTOCORRELATION !!!!
set.seed(1) # setting the seed
coo1<-st_coordinates(datos.sf)
cuales<-sample(1:nrow(coo1),5000) # sample of points
coo2<-coo1[cuales,]
# Distance computation
distan<-(dist(coo2))^1
hist(distan,300)
# Weights calculation: 0-1
w <- 1/as.matrix(distan)
umbral<-0.005
w[distan<umbral]<-1
diag(w) <- 0
w[distan>=umbral]<-0
table(w)
w
0 1
24719486 280514
barplot(table(apply(w,1,sum)))
#sum(w==Inf)
#w[w==Inf]<-0
#rev(sort(as.numeric(w)))[1:100]
#hist(as.numeric(w),50)
#tope<-quantile(as.numeric(w),0.99)
#w[w>tope]<-tope
#summary(as.numeric(w))
eltest<-moran.test(datos.sf$ResidRel[cuales],mat2listw(w),randomisation = TRUE)
round(eltest$estimate,4)
Moran I statistic Expectation Variance
-0.0016 -0.0002 0.0000
(eltest$estimate[1]-eltest$estimate[2])/sqrt(eltest$estimate[3])
Moran I statistic
-0.7222594
eltest
Moran I test under randomisation
data: datos.sf$ResidRel[cuales]
weights: mat2listw(w)
Moran I statistic standard deviate = -0.72226, p-value = 0.7649
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
-1.553799e-03 -2.000400e-04 3.513146e-06
# Noran Scatter Plot
moran.plot(datos.sf$ResidRel[cuales],mat2listw(w))
# Usando Residuos Topeados y Balanceados
datos.res<-datos.filt %>% select(Lon,Lat,ResidRelBal)
lof.sco<-lof(datos.res,k=11)
Warning: lof: k is now deprecated. use minPts = 12 instead .
summary(lof.sco)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.9106 0.9951 1.0170 1.0366 1.0528 2.9170
plot(lof.sco)
datos.filt %>% filter(lof.sco>2)
Color proporcional a los Residuos Topeados Balanceados Tamaño del punto con umbral dependiente del Score
# agrego el score a la base
datos.filt$Lof.Sco<-lof.sco
# descriptive character field
datos.filt$Desc<-paste("Barrio",datos.filt$Barrio,"Sup",datos.filt$Sup,"Beds",datos.filt$Beds,"Baths",datos.filt$Baths,"<br>Precio",as.character(datos.filt$Precio),"Pred",as.character(round(datos.filt$Pred,2)),"ResRel",as.character(round(datos.filt$ResidRel,2)))
#
datos.out.sf <- datos.filt %>% st_as_sf(coords = c("Lon", "Lat"), crs = 4326)
umbral.lof<-0.990 # umbral en terminos de percentiles
pal <- colorNumeric(palette = "RdBu", domain = datos.filt$ResidRelBal)
limite<-quantile(datos.filt$Lof.Sco,umbral.lof)
reescala<-function(x){ifelse(x<limite,x*5,x*20)}
# Color proporcional a los Residuos Topeados Balanceados
# Tamaño con umbral por factor
leaflet(datos.out.sf) %>% addTiles() %>% addCircleMarkers(fillOpacity = 0.5,weight=1,radius=~reescala(Lof.Sco),color = ~pal(ResidRelBal),popup = ~Desc,label=~paste("Score",as.character(round(Lof.Sco,2))))
NA
datos.filt.iforests<-datos.res
# Modelo isolation forest
isoforest <- isolationForest$new(
sample_size = as.integer(nrow(datos.filt.iforests)/1), # cant de obs muestreadas en cada arbol
num_trees = 300, # cant de arboles
replace = FALSE, # con reemplazo ?
respect_unordered_factors = NULL,
max_depth = 24, # profundidad maxima de cada arbol
seed = 123 # semilla
)
# Entrenamiento
isoforest$fit(dataset = datos.filt.iforests)
INFO [17:24:08.223] Building Isolation Forest ...
INFO [17:24:15.332] done
INFO [17:24:15.357] Computing depth of terminal nodes ...
INFO [17:29:27.051] done
INFO [17:29:31.686] Completed growing isolation forest
predicciones <- isoforest$predict(data = datos.filt.iforests)
hist(predicciones$average_depth,50)
hist(predicciones$anomaly_score,50)
#
#escores<-scale(predicciones$anomaly_score)
ifo.sco<-predicciones$anomaly_score
peores<-rev(order(ifo.sco))
mejores<-(order(ifo.sco))
# Veo los peores
datos.filt[peores[1:50],]
# Grafico de Scores
plot(ifo.sco)
plot(datos.filt$ResidRelTop,ifo.sco)
Visualizacion de Anomalias de IForest
# adding outliers scores to data
datos.filt$Ifo.Sco<-ifo.sco
# descriptive character field
datos.filt$Desc<-paste("Barrio",datos.filt$Barrio,"Sup",datos.filt$Sup,"Beds",datos.filt$Beds,"Baths",datos.filt$Baths,"<br>Precio",as.character(datos.filt$Precio),"Pred",as.character(round(datos.filt$Pred,2)),"ResRel",as.character(round(datos.filt$ResidRel,2)))
#
datos.out.sf <- datos.filt %>% st_as_sf(coords = c("Lon", "Lat"), crs = 4326)
# with leaflet
pal <- colorNumeric(palette = "RdBu", domain = datos.filt$ResidRelBal)
umbral.ifo<-0.990
limite<-quantile(datos.filt$Ifo.Sco,umbral.ifo)
reescala<-function(x){ifelse(x<limite,x*10,x*40)}
#reescala<-function(x){ifelse(x<0.55,x*1,x*20)}
#
leaflet(datos.out.sf) %>% addTiles() %>% addCircleMarkers(fillOpacity = 0.5,weight=1,radius=~reescala(Ifo.Sco),color = ~pal(ResidRelBal),popup = ~Desc,label=~paste("Score",as.character(round(ifo.sco,2))))
NA
leaflet(datos.out.sf) %>% addTiles() %>% addCircleMarkers(fillOpacity = 0.1,weight=1,radius=0.5,color = "blue")
NA